# coding:utf-8

import warnings
warnings.filterwarnings("ignore")

import sys
import jieba
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.externals import joblib
from gensim.models.word2vec import Word2Vec
from sklearn.model_selection import train_test_split



def build_sentence_vector(text, size, imdb_w2v):
    """Average the word vectors of a sentence into one sentence vector.

    Args:
        text: iterable of tokens (one tokenized sentence).
        size: dimensionality of the word vectors.
        imdb_w2v: trained word-vector provider; either a gensim Word2Vec
            model (vectors are exposed on its ``.wv`` attribute) or any
            mapping whose ``__getitem__`` raises KeyError for unknown words.

    Returns:
        np.ndarray of shape (1, size): the mean of the in-vocabulary word
        vectors, or the zero vector if no token is in the vocabulary
        (including the empty sentence).
    """
    # Index via .wv when present: direct Word2Vec.__getitem__ was
    # deprecated in gensim 3.x and removed in gensim 4.x.
    vectors = getattr(imdb_w2v, "wv", imdb_w2v)
    vec = np.zeros((1, size))
    count = 0
    for word in text:
        try:
            vec += vectors[word].reshape((1, size))    # element-wise sum
            count += 1
        except KeyError:
            continue    # skip out-of-vocabulary words
    if count != 0:
        vec /= count
    return vec



def get_train_vecs(x_train, x_test):
    """Train a Word2Vec model and persist averaged sentence vectors.

    Builds a 300-dim Word2Vec model from the tokenized training sentences,
    continues training on the test sentences, then saves the model and the
    per-sentence averaged vectors under svm_data/.

    Args:
        x_train: list of tokenized training sentences (lists of tokens).
        x_test: list of tokenized test sentences.

    Side effects:
        Writes svm_data/train_vecs.npy, svm_data/test_vecs.npy and
        svm_data/w2v_model/w2v_model.pkl (directories must already exist).
    """
    n_dim = 300
    # Initialize the model and build the vocabulary from the training set.
    # NOTE(review): `size=` is the gensim 3.x keyword (renamed to
    # `vector_size=` in gensim 4.x) -- this file targets gensim 3.x.
    imdb_w2v = Word2Vec(size=n_dim, min_count=10)
    imdb_w2v.build_vocab(x_train)

    # Train on the training sentences.
    imdb_w2v.train(x_train, total_examples=imdb_w2v.corpus_count,
                   epochs=imdb_w2v.epochs)
    train_vecs = np.concatenate(
        [build_sentence_vector(z, n_dim, imdb_w2v) for z in x_train])
    np.save("svm_data/train_vecs.npy", train_vecs)

    # Continue training on the test sentences so their in-vocabulary words
    # get updated embeddings.
    # NOTE(review): training on the test set leaks test information into
    # the embeddings; kept for compatibility with the original pipeline.
    # BUG FIX: total_examples must describe the corpus actually passed to
    # train(); imdb_w2v.corpus_count still reflects x_train here, so use
    # len(x_test) instead.
    imdb_w2v.train(x_test, total_examples=len(x_test),
                   epochs=imdb_w2v.epochs)
    imdb_w2v.save("svm_data/w2v_model/w2v_model.pkl")

    # Build and persist the test sentence vectors.
    test_vecs = np.concatenate(
        [build_sentence_vector(z, n_dim, imdb_w2v) for z in x_test])
    np.save("svm_data/test_vecs.npy", test_vecs)

def get_data():
    """Load the cached sentence vectors and labels saved under svm_data/.

    Returns:
        Tuple of numpy arrays: (train_vecs, y_train, test_vecs, y_test).
    """
    paths = ('svm_data/train_vecs.npy', 'svm_data/y_train.npy',
             'svm_data/test_vecs.npy', 'svm_data/y_test.npy')
    train_vecs, y_train, test_vecs, y_test = (np.load(p) for p in paths)
    return train_vecs, y_train, test_vecs, y_test



def get_predict_vecs(words, n_dim=300):
    """Build the sentence vector for one tokenized sentence to classify.

    Loads the persisted Word2Vec model and averages the word vectors of
    the given tokens.

    Args:
        words: list of tokens (e.g. from jieba.lcut).
        n_dim: dimensionality of the word vectors; must match the
            dimensionality the saved model was trained with (300 by
            default, matching get_train_vecs).

    Returns:
        np.ndarray of shape (1, n_dim): the averaged sentence vector.
    """
    imdb_w2v = Word2Vec.load('svm_data/w2v_model/w2v_model.pkl')
    return build_sentence_vector(words, n_dim, imdb_w2v)



def svm_predict(string):
    """Classify the sentiment of a single raw sentence.

    Args:
        string: the sentence text to classify.

    Returns:
        The class label the persisted SVM model predicts for the
        sentence's averaged word vector (first element of predict()).
    """
    # Tokenize, embed, then classify with the saved SVM model.
    tokens = list(jieba.cut(string))
    sentence_vec = get_predict_vecs(tokens)
    model = joblib.load('svm_data/svm_model/model.pkl')
    prediction = model.predict(sentence_vec)
    return prediction[0]

